
#############################################################
### Geographic Representativeness of Legislatures Project ###
###                                                       ###
###           Leonardo Carella, Andrew Eggers             ###
#############################################################

# This is the SECOND script to run to replicate the cross-country analysis. 
# It computes SURLI scores with the rotated CDF method (4 increments),
# for all countries, using both 2005 population and mean MP
# birthyear population as benchmarks. 

# It requires three datasets as inputs:

# 1. glp_complete.rds: the dataset of MPs' geocoded birthplaces.
# 2. grids_complete.rds: the gridded population dataset for 2005.
# 3. grids_2005_15_arcmin.rds: the gridded population dataset for the mean legislator birthyear.

# It produces two datasets as outputs

# 1. cdfds_w_surlis_4_increments_2005.rds: real parliament and ‘fake parliament’
# discrepancy scores across 500 simulations (2005 benchmark)

# 2. cdfds_w_surlis_4_increments_birthyear.rds: real parliament and ‘fake parliament’
# discrepancy scores across 500 simulations (mean MP birthyear benchmark)

#### Packages ####

library(tidyverse)
library(raster)
library(sp)
library(parallel)
library(move)

#### Functions ####

prep_data_for_EMD <- function(country_name, arcmin, random_subset_fraction = 1) {
  #creates country subsets for MP and grids
  # this function relies on there being gpl_complete and grids_complete objects in memory -- not ideal!
  df1 <- glp_complete %>%
    dplyr::filter(country == country_name)
  if(random_subset_fraction < 1){
    stopifnot(is.numeric(random_subset_fraction))
    df1 <- df1 %>% 
      slice_sample(prop = random_subset_fraction)
  }
  df2 <- grids_complete %>%
    dplyr::filter(country == country_name, gridsize == arcmin)
  if(nrow(df2) <= 1) {stop("Fewer than 2 grids available. Try lower arcmin parameter")}
  else {
    bpmatrix <- as.matrix(cbind(as.numeric(df1$bpcoords.lon), as.numeric(df1$bpcoords.lat)))
    gmatrix <- as.matrix(cbind(as.numeric(df2$s1), as.numeric(df2$s2)), na.omit == T)
    a <- raster::pointDistance(bpmatrix, gmatrix, lonlat=TRUE, allpairs = TRUE) #added the allpairs parameter: relevant when number of grids = number of MPs.
    b <- apply(a, 1, which.min)
    k <- unlist(b)
    p <- data.frame(df2$gID[k], df2$s1[k], df2$s2[k], df1$person_nid)
    countdf <- as.data.frame(table(p$df2.gID.k.))
    countdf$gID <- countdf$Var1 
    output <- merge(df2, countdf, by = "gID", all = TRUE)
    output$Freq[is.na(output$Freq)] <- 0
    output$popshare <- output$pop/sum(output$pop)
    output$MPshare <- output$Freq/sum(output$Freq)
    return(output)}
}

get_real_emd <- function(country_name, arcmin, noisy = F, random_subset_fraction = 1) {
  if(noisy){cat(country_name, "\n")}
  output <- prep_data_for_EMD(country_name, arcmin, random_subset_fraction)
  popsp <- sp::SpatialPointsDataFrame(coords = cbind(output$s1, output$s2), data = data.frame(output$popshare))
  mpsp <- sp::SpatialPointsDataFrame(coords = cbind(output$s1, output$s2), data = data.frame(output$MPshare))
  return(as.numeric(move::emd(popsp,mpsp)))}

get_fake_emd <- function(country_name, arcmin, seed, random_subset_fraction = 1){
  output <- prep_data_for_EMD(country_name, arcmin, random_subset_fraction)
  set.seed(seed)
  x <- as.data.frame(table(sample(length(output$gID), sum(output$Freq), 
                                  prob = output$pop, replace = TRUE)))
  names(x)[names(x) == "Var1"] <- "gIDnew"
  names(x)[names(x) == "Freq"] <- "simulFreq"
  output$gIDnew <- 1:length(output$gID)
  simulOUTPUT <- merge(output, x, by = "gIDnew", all = TRUE)
  simulOUTPUT$simulFreq[is.na(simulOUTPUT$simulFreq)] <- 0
  simulOUTPUT$popshare <- simulOUTPUT$pop/sum(simulOUTPUT$pop)
  simulOUTPUT$MPshare <- simulOUTPUT$simulFreq/sum(simulOUTPUT$simulFreq)
  #computes EMDs
  popsp <- sp::SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$popshare))
  mpsp <- sp::SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$MPshare))
  return(as.numeric(move::emd(popsp,mpsp)))
}

weighted.quantile = function(vals, probs, weights = NULL){
  if(is.null(weights)){return(quantile(vals, probs))}
  vals = vals[order(vals)]
  weights = weights[order(vals)]
  cs.weights = cumsum(weights)
  # we want to compute 
  # how do we vectorize this, for when there are multiple probs to check? 
  # this is the cheapo way
  out = rep(NA, length(probs)) 
  for(i in 1:length(probs)){
    out[i] <- min(vals[cs.weights > probs[i]])
  }
  out[is.infinite(out)] <- max(vals)
  out
}

cdf_overlap_horizontal = function(x.vals, x.weights, y.vals, y.weights, cols = gray.colors(4), plot.ecdf = T, plot.boxes = T, noisy = F, x.series.lab = "CDF of X", y.series.lab = "CDF of Y", main = NULL){
  # sort the vals and weights 
  x.weights = x.weights[order(x.vals)]; y.weights = y.weights[order(y.vals)]
  x.vals = x.vals[order(x.vals)]; y.vals = y.vals[order(y.vals)]
  # all y-values where there is a step in either ECDF
  all.probs = sort(c(0, cumsum(x.weights), cumsum(y.weights))) 
  # now we need the height of the two ECDFs at these y-values points
  x.cuts = suppressWarnings(weighted.quantile(vals = x.vals, probs = all.probs, weights = x.weights))
  y.cuts = suppressWarnings(weighted.quantile(vals = y.vals, probs = all.probs, weights = y.weights))
  # weighted.quantile sometimes produced Inf. This happened at the top of the distribution, because there were no vals where the cumsum of the weights were greater than probs[i]. I now set those to the maximum value of vals.  
  include <- !is.infinite(x.cuts) & !is.infinite(y.cuts)
  if(sum(include) < length(x.cuts)){
    warning("Some values of x.cuts or y.cuts infinite and thus excluded.")
  }
  all.probs <- all.probs[include]
  x.cuts <- x.cuts[include]
  y.cuts <- y.cuts[include]
  # calculate the area
  the.area = 0
  for(i in 2:(length(all.probs) - 1)){
    this.rectangle.area = (all.probs[i] - all.probs[i-1])*abs(x.cuts[i-1] - y.cuts[i-1])
    the.area = the.area + this.rectangle.area
    if(noisy){cat("Adding rectangle of area ", round(this.rectangle.area, 4), "\n")}
  }
  if(plot.ecdf | plot.boxes){
    plot(1.25*range(c(0, x.vals, y.vals)), c(0,1), type = "n", axes = F, xlab = "", ylab = "Proportion below", main = main)
    axis(1); axis(2)
  }
  if(plot.boxes){
    # plot the rectangles
    for(i in 2:(length(all.probs) - 1)){
      this.rectangle.area = (all.probs[i] - all.probs[i-1])*abs(x.cuts[i-1] - y.cuts[i-1])
      the.area = the.area + this.rectangle.area
      if(noisy){cat("Adding rectangle of area ", round(this.rectangle.area, 4), "\n")}
      polygon(c(x.cuts[i-1], y.cuts[i-1], y.cuts[i-1], x.cuts[i-1]), c(rep(all.probs[i], 2), rep(all.probs[i-1], 2)), col = cols[1 + i%%length(cols)], border = NA)
    }
  }
  if(plot.ecdf){
    # plot the weighted ECDF
    big = 1000000
    xs.to.plot = as.vector(c(-big, t(cbind(x.vals, x.vals)), big))
    x.cs.1 = cumsum(x.weights)
    x.weights.to.plot = c(0, 0, as.vector(t(cbind(x.cs.1[-length(x.weights)], x.cs.1[-length(x.weights)]))), 1, 1)
    lines(xs.to.plot, x.weights.to.plot, type = "o", lty = 1, pch = 19, cex = .5)
    ys.to.plot = as.vector(c(-big, t(cbind(y.vals, y.vals)), big))
    y.cs.1 = cumsum(y.weights)
    y.weights.to.plot = c(0, 0, as.vector(t(cbind(y.cs.1[-length(y.weights)], y.cs.1[-length(y.weights)]))), 1, 1)
    lines(ys.to.plot, y.weights.to.plot, type = "o", lty = 2, pch = 1, cex = .5)
    # legend
    legend("topleft", lty = c(1,2), pch = c(19, 1), cex = c(.5, .5), legend = c(x.series.lab, y.series.lab), seg.len = 5)
  }
  # return the area
  the.area
}

rotated_1d_cdf_discrepancy <- function(grid_coords, sig1, sig2, increments, recenter_at = "mean_of_means", normalize = F, raw_data = F)
{stopifnot(ncol(grid_coords) == 2)
  stopifnot(length(sig1) == length(sig2))
  stopifnot(length(sig1) == nrow(grid_coords))
  
  # normalize
  if(normalize){
    sig1 <- sig1/sum(sig1)
    sig2 <- sig2/sum(sig2)
  }
  
  if(recenter_at == "mean_of_means"){
    x_mean_of_means <- mean(sig1 %*% grid_coords[,1], sig2 %*% grid_coords[,1])
    y_mean_of_means <- mean(sig1 %*% grid_coords[,2], sig2 %*% grid_coords[,2])   
    new_center <- c(x_mean_of_means, y_mean_of_means)
  }else{stop("recenter_at argument must be 'mean_of_means' for now.\n")}
  
  # recenter
  grid_coords <- grid_coords - matrix(new_center, nrow = nrow(grid_coords), ncol = 2, byrow = T)
  
  angles <- seq(0, pi, length = increments+1)[-1] # why take off the first one? because it is same as the last one. 
  cdf_discreps <- rep(NA, length(angles))
  for(i in 1:length(angles)){
    angle <- angles[i]
    rotation_matrix <- matrix(c(cos(angle), sin(angle), -sin(angle), cos(angle)), nrow = 2, ncol = 2)
    these_grid_coords <- (as.matrix(grid_coords)) %*% rotation_matrix
    cdf_discreps[i] <- cdf_overlap_horizontal(these_grid_coords[,1], sig1, these_grid_coords[,1], sig2, plot.ecdf = F, plot.boxes = F, noisy = F)
  }
  the_raw_data <- data.frame(angle = angles, cdf_discrep = cdf_discreps)
  if(raw_data){
    the_raw_data
  }else{
    mean(the_raw_data$cdf_discrep)
  }
}

get_real_rotated_cdf_discrepancy <- function(country_name, arcmin, increment, noisy = F,  random_subset_fraction = 1) {
  if(noisy){cat(country_name, ", increment = ", increment, "\n", sep = "")}
  data <- prep_data_for_EMD(country_name, arcmin, random_subset_fraction)
  popsp <- sp::SpatialPointsDataFrame(coords = cbind(data$s1, data$s2), data = data.frame(data$popshare))
  mpsp <- sp::SpatialPointsDataFrame(coords = cbind(data$s1, data$s2), data = data.frame(data$MPshare))
  df1 <- as.data.frame(popsp)
  df2 <- as.data.frame(mpsp)
  grid_coords <- as.data.frame(df1[-1])
  sig1 <- df1[,1]
  sig2 <- df2[,1]
  rcdf <- rotated_1d_cdf_discrepancy(grid_coords, sig1, sig2, increments = increment, recenter_at = "mean_of_means", normalize = F, raw_data = F)
  return(rcdf)
}

get_fake_rotated_cdf_discrepancy_given_grid_squares <- function(grid_squares, increment = 4, alpha = 1, leg_size = 100){
  grid_squares %>% 
    filter(!is.na(pop) & pop > 0) %>% 
    mutate(raw_probs = pop^alpha,
           probs = raw_probs/sum(raw_probs),
           popshare = pop/sum(pop,na.rm = T)) -> interim 
  
  if(nrow(interim) == 0){cat("No grid squares with positive population?\n"); return(NA)}
  interim %>% 
    mutate(legislators = as.numeric(rmultinom(n = 1, size = leg_size, prob = probs)),
           leg_share = legislators/leg_size) -> x
  
  popsp <- sp::SpatialPointsDataFrame(coords = cbind(x$s1, x$s2), data = data.frame(x$popshare)) %>% as.data.frame()
  mpsp <- sp::SpatialPointsDataFrame(coords = cbind(x$s1, x$s2), data = data.frame(x$leg_share)) %>% as.data.frame()
  
  rotated_1d_cdf_discrepancy(grid_coords = popsp[,-1], sig1 = popsp[,1], sig2 = mpsp[,1], increments = increment, recenter_at = "mean_of_means", normalize = F, raw_data = F)
}

get_fake_rotated_cdf_discrepancy <- function(country_name, arcmin, increment, seed = NULL, random_subset_fraction = 1, alpha = 1, leg_size = NULL) {
  output <- prep_data_for_EMD(country_name, arcmin, random_subset_fraction)
  if(!is.null(seed)){set.seed(seed)}
  if(is.null(leg_size)){leg_size = sum(output$Freq)}
  # counterfactual count of MPs per grid square, summing up to actual count in `output` or leg_size if provided
  x <- as.data.frame(table(sample(length(output$gID), leg_size, 
                                  prob = (output$pop)^alpha, replace = TRUE)))
  names(x)[names(x) == "Var1"] <- "gIDnew"
  names(x)[names(x) == "Freq"] <- "simulFreq"
  output$gIDnew <- 1:length(output$gID)
  simulOUTPUT <- merge(output, x, by = "gIDnew", all = TRUE)
  simulOUTPUT$simulFreq[is.na(simulOUTPUT$simulFreq)] <- 0
  simulOUTPUT$popshare <- simulOUTPUT$pop/sum(simulOUTPUT$pop)
  simulOUTPUT$MPshare <- simulOUTPUT$simulFreq/sum(simulOUTPUT$simulFreq)
  #computes EMDs
  popsp <- sp::SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$popshare))
  mpsp <- sp::SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$MPshare))
  df1 <- as.data.frame(popsp)
  df2 <- as.data.frame(mpsp)
  grid_coords <- as.data.frame(df1[-1])
  sig1 <- df1[,1]
  sig2 <- df2[,1]
  rcdf <- rotated_1d_cdf_discrepancy(grid_coords, sig1, sig2, increments = increment, recenter_at = "mean_of_means", normalize = F, raw_data = F)
  return(rcdf)
}

get_SURLI_emd_method <- function(country_name, arcmin, n_iter, output_raw = FALSE) {
  # this is for comparison -- not used
  real_emd <- get_real_emd(country_name = country_name, arcmin = arcmin)
  fake_emd <- as.matrix(parallel::mclapply(1:n_iter, get_fake_emd, country_name = country_name, arcmin = arcmin), ncol = 1)
  if(output_raw){return(list(real_emd = real_emd, fake_emd = fake_emd))}
  zscore <- (real_emd - mean(as.numeric(fake_emd)))/sd(fake_emd)
  return(zscore)}

get_SURLI_cdf_method <- function(country_name, arcmin, n_iter, increment, output_raw = FALSE, noisy = F, random_subset_fraction = 1, alpha = 1) {
  real_cdfd <- get_real_rotated_cdf_discrepancy(country_name = country_name, 
                                                arcmin = arcmin, increment = increment, random_subset_fraction = random_subset_fraction)
  fake_cdfd <- as.matrix(parallel::mclapply(1:n_iter, get_fake_rotated_cdf_discrepancy, country_name = country_name, 
                                            arcmin = arcmin, increment = increment, random_subset_fraction = random_subset_fraction, alpha = alpha), ncol = 1)
  if(noisy){cat("Completed ", country_name, ", subset fraction ", random_subset_fraction, ", alpha = ", alpha, "\n", sep = "")}
  if(output_raw){return(list(real_emd = real_cdfd, fake_emd = as.numeric(fake_cdfd)))}
  zscore <- (real_cdfd - mean(as.numeric(fake_cdfd)))/sd(fake_cdfd)
  return(zscore)
}

#### Analysis ####

path_to_data <- paste(getwd(), "/", sep = "")

glp_complete <- readRDS(str_c(path_to_data, "glp_complete.rds")) %>% 
  as_tibble()

grids_strings <- c("grids_complete.rds", "grids_2005_15_arcmin.rds")  # grids_complete is birthyear pop distro, grids_2005_15_arcmin is 2005 pop distro
grids_labels <- c("birthyear", "2005")
n_iter <- 500

for(k in 1:length(grids_strings)){
  
  grids_complete <- readRDS(str_c(path_to_data, grids_strings[k])) %>% 
    as_tibble()
  
  # get the set of cases to consider
  grids_complete %>% 
    dplyr::count(country, year) %>% 
    dplyr::rename(grid_squares = n) -> all_cases 

  all_cases %>%
    mutate(arcmin = 15, increment = 4, n_iter = n_iter) %>%
    mutate(out = pmap(dplyr::select(., -year, -grid_squares), get_SURLI_cdf_method, output_raw = T, noisy = T)) %>%
    unnest_wider(out) -> cdfds_w_fakes_4
  
  cdfds_w_fakes_4 %>%
    mutate(log_fake_emd = map(fake_emd, log),
           z_score_log = (log(real_emd) - map_dbl(log_fake_emd, mean))/map_dbl(log_fake_emd, sd),
           z_score_level = (real_emd - map_dbl(fake_emd, mean))/map_dbl(fake_emd, sd)) -> cdfds_w_surlis

  saveRDS(cdfds_w_surlis, file = str_c("cdfds_w_surlis_4_increments_", grids_labels[k], ".RDS"))
  
}
